{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the SparkConf object that the following cells configure (master URL, app name).\n",
    "conf = pyspark.SparkConf()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 设置为local模式，在单机模拟环境下，这是默认的模式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pyspark.conf.SparkConf at 0x1aad0e1f048>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Set the master URL on conf; the call returns the SparkConf, which is why the cell echoes its repr.\n",
    "conf.setMaster(\"local[*]\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 设置任务名称"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pyspark.conf.SparkConf at 0x1aad0e1f048>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Set the application name on conf; the call returns the SparkConf, hence the echoed repr.\n",
    "conf.setAppName(\"AAA\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use getOrCreate so that re-running this cell reuses the active SparkContext;\n",
    "# calling the SparkContext constructor a second time in the same process raises ValueError.\n",
    "# Note: conf is only applied when a new context is actually created.\n",
    "sc = pyspark.SparkContext.getOrCreate(conf=conf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 随机生成十万个字母，A-Z"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# random.randint(a, b) includes both endpoints, so the range must stop at ord('Z') == 90;\n",
    "# an upper bound of 91 would also generate '[' in addition to the letters A-Z.\n",
    "x = [chr(random.randint(ord('A'), ord('Z'))) for _ in range(100000)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 显示前100个"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['V', 'C', 'H', 'I', 'K', 'F', 'K', 'B', 'Y', 'N', 'X', 'Y', 'S', 'W', 'K', 'U', 'F', 'Y', 'W', 'K', 'E', 'J', 'Y', 'P', 'J', 'R', 'Y', 'L', 'F', 'Q', 'W', 'D', 'S', 'N', 'Z', 'C', '[', 'N', 'M', 'A', 'P', 'F', 'E', 'D', 'N', 'R', 'C', 'Y', 'G', 'R', 'S', 'L', 'Z', 'A', 'H', 'G', 'E', 'E', 'M', 'L', 'G', 'A', 'Q', 'F', 'Z', 'I', 'B', 'S', 'B', 'Y', 'C', 'O', 'V', 'G', 'Y', 'J', 'F', 'E', 'K', 'D', 'T', 'P', 'M', 'X', '[', 'M', 'I', 'K', 'O', 'Q', 'U', 'S', 'W', 'U', 'C', 'S', 'E', 'A', 'N', 'X']\n"
     ]
    }
   ],
   "source": [
    "# Print the first 100 generated characters as a quick sanity check.\n",
    "print(x[:100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Turn the local Python list x into an RDD via the SparkContext so it can be aggregated with Spark.\n",
    "rdd = sc.parallelize(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分类统计聚合，每个字母计数为1，然后看看出现了多少次"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('N', 3760),\n",
       " ('J', 3692),\n",
       " ('[', 3657),\n",
       " ('O', 3738),\n",
       " ('H', 3688),\n",
       " ('B', 3620),\n",
       " ('C', 3626),\n",
       " ('W', 3729),\n",
       " ('R', 3769),\n",
       " ('V', 3719),\n",
       " ('I', 3788),\n",
       " ('X', 3734),\n",
       " ('Z', 3698),\n",
       " ('K', 3661),\n",
       " ('L', 3659),\n",
       " ('F', 3767),\n",
       " ('U', 3771),\n",
       " ('Q', 3711),\n",
       " ('M', 3694),\n",
       " ('A', 3683),\n",
       " ('T', 3649),\n",
       " ('S', 3655),\n",
       " ('Y', 3765),\n",
       " ('E', 3614),\n",
       " ('P', 3731),\n",
       " ('D', 3706),\n",
       " ('G', 3716)]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Classic count-by-key: pair every element with 1, sum the ones per key,\n",
    "# then collect the (element, count) pairs back to the driver for display.\n",
    "(\n",
    "    rdd.map(lambda letter: (letter, 1))\n",
    "    .reduceByKey(lambda left, right: left + right)\n",
    "    .collect()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
